Contents

# Standard library
import os
import sqlite3

# Third-party
import matplotlib.pyplot as plt
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.metrics import f1_score, confusion_matrix, make_scorer
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, StandardScaler, MinMaxScaler
from xgboost import XGBClassifier
from ydata_profiling import ProfileReport

# Load the raw CSV and normalize it into three tables (Person, Loan,
# CreditHistory) inside an in-memory SQLite database.
loan_data = pd.read_csv('loan_data.csv')
connection = sqlite3.connect(":memory:")
cursor = connection.cursor()

cursor.execute("""
CREATE TABLE Person (
    person_id INTEGER PRIMARY KEY AUTOINCREMENT,
    person_age FLOAT,
    person_gender TEXT,
    person_education TEXT,
    person_income FLOAT,
    person_emp_exp INTEGER,
    person_home_ownership TEXT
);
""")

cursor.execute("""
CREATE TABLE Loan (
    loan_id INTEGER PRIMARY KEY AUTOINCREMENT,
    person_id INTEGER,
    loan_amnt FLOAT,
    loan_intent TEXT,
    loan_int_rate FLOAT,
    loan_percent_income FLOAT,
    loan_status INTEGER,
    FOREIGN KEY (person_id) REFERENCES Person (person_id)
);
""")

cursor.execute("""
CREATE TABLE CreditHistory (
    credit_id INTEGER PRIMARY KEY AUTOINCREMENT,
    person_id INTEGER,
    cb_person_cred_hist_length FLOAT,
    credit_score INTEGER,
    previous_loan_defaults_on_file TEXT,
    FOREIGN KEY (person_id) REFERENCES Person (person_id)
);
""")

# Unique person rows get surrogate ids 1..N.
person_columns = ['person_age', 'person_gender', 'person_education', 
                  'person_income', 'person_emp_exp', 'person_home_ownership']
person_data = loan_data[person_columns].drop_duplicates().reset_index(drop=True)
person_data['person_id'] = range(1, len(person_data) + 1)
person_data.to_sql('Person', connection, if_exists='append', index=False)

# Loans resolve their person_id by matching on the person attribute columns.
loan_columns = ['loan_amnt', 'loan_intent', 'loan_int_rate', 'loan_percent_income', 'loan_status']
loan_data_normalized = loan_data.merge(person_data, on=person_columns, how='left')
loan_data_normalized = loan_data_normalized[['person_id'] + loan_columns]
loan_data_normalized.to_sql('Loan', connection, if_exists='append', index=False)

# BUG FIX: the original attached person_id via a positional index join
# (`loan_data[credit_columns].join(person_data[['person_id']])`). After
# drop_duplicates(), person_data's rows no longer line up with loan rows,
# so trailing rows received NULL person_id and the rest were mismatched.
# Resolve person_id the same way the Loan table does: merge on the person
# attribute columns.
credit_columns = ['cb_person_cred_hist_length', 'credit_score', 'previous_loan_defaults_on_file']
credit_data = loan_data.merge(person_data, on=person_columns, how='left')
credit_data = credit_data[['person_id'] + credit_columns]
credit_data.to_sql('CreditHistory', connection, if_exists='append', index=False)

print("Person table count:", cursor.execute("SELECT COUNT(*) FROM Person;").fetchone()[0])
print("Loan table count:", cursor.execute("SELECT COUNT(*) FROM Loan;").fetchone()[0])
print("Credit History table count:", cursor.execute("SELECT COUNT(*) FROM CreditHistory;").fetchone()[0])
Person table count: 44983
Loan table count: 45000
Credit History table count: 45000
# Rebuild the flat modeling dataset by joining the three normalized tables
# back together on person_id.
query = """
SELECT 
    p.person_age,
    p.person_gender,
    p.person_education,
    p.person_income,
    p.person_emp_exp,
    p.person_home_ownership,
    l.loan_amnt,
    l.loan_intent,
    l.loan_int_rate,
    l.loan_percent_income,
    l.loan_status,
    c.cb_person_cred_hist_length,
    c.credit_score,
    c.previous_loan_defaults_on_file
FROM Person p
JOIN Loan l 
ON p.person_id = l.person_id
JOIN CreditHistory c 
ON p.person_id = c.person_id;
"""
# Materialize the join straight into a DataFrame for modeling.
loan_data_full = pd.read_sql_query(query, connection)
print(loan_data_full.head())
   person_age person_gender person_education  person_income  person_emp_exp  \
0        22.0        female           Master        71948.0               0   
1        21.0        female      High School        12282.0               0   
2        25.0        female      High School        12438.0               3   
3        23.0        female         Bachelor        79753.0               0   
4        24.0          male           Master        66135.0               1   

  person_home_ownership  loan_amnt loan_intent  loan_int_rate  \
0                  RENT    35000.0    PERSONAL          16.02   
1                   OWN     1000.0   EDUCATION          11.14   
2              MORTGAGE     5500.0     MEDICAL          12.87   
3                  RENT    35000.0     MEDICAL          15.23   
4                  RENT    35000.0     MEDICAL          14.27   

   loan_percent_income  loan_status  cb_person_cred_hist_length  credit_score  \
0                 0.49            1                         3.0           561   
1                 0.08            0                         2.0           504   
2                 0.44            1                         3.0           635   
3                 0.44            1                         2.0           675   
4                 0.53            1                         4.0           586   

  previous_loan_defaults_on_file  
0                             No  
1                            Yes  
2                             No  
3                             No  
4                             No  
# Inspect the target's class balance before modeling.
status_share = loan_data_full['loan_status'].value_counts(normalize=True)
print("Loan Status Distribution:")
print(status_share)

ax = sns.countplot(data=loan_data_full, x='loan_status')
ax.set_title("Distribution of Loan Status")
ax.set_xlabel("Loan Status")
ax.set_ylabel("Count")
plt.show()
Loan Status Distribution:
loan_status
0    0.777778
1    0.222222
Name: proportion, dtype: float64
_images/1158dfaeaf95525b5d440e2a5a093e263eae1153e444c1b2b1055957f974d17b.png
#train/test split with stratification
X = loan_data_full.drop('loan_status', axis=1)  
y = loan_data_full['loan_status'] 

# Hold out 20%; stratify=y preserves the ~78/22 class ratio in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y  
)
# Automated EDA report over the raw CSV frame (loan_data, not the SQL-joined
# loan_data_full).
profile = ProfileReport(loan_data, title="Loan Data Profiling Report", explorative=True)
profile

# Pairwise correlations (pandas default: Pearson) among numeric columns.
numeric_columns = loan_data_full.select_dtypes(include=['float64', 'int64'])
correlation_matrix = numeric_columns.corr()
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap="coolwarm", cbar=True)
plt.title("Correlation Matrix")
plt.show()
_images/bed1799dd6e0caf5525583311e6a1d0a94bd3f74dd8f9e9bdaa802c8dbe0aaa0.png
def cap_outliers(series, lower_percentile=1, upper_percentile=99):
    """Winsorize *series*: clip values outside the given percentile bounds."""
    low, high = np.percentile(series, [lower_percentile, upper_percentile])
    return np.clip(series, low, high)

# Winsorize heavy-tailed numeric features at the 1st/99th percentiles.
features_with_outliers = ['person_age', 'person_emp_exp', 'loan_amnt', 'cb_person_cred_hist_length', 'credit_score']
for feature in features_with_outliers:
    loan_data[feature] = cap_outliers(loan_data[feature])

# person_emp_exp is dropped after capping — presumably redundant with
# person_age (TODO confirm rationale).
loan_data.drop(columns=['person_emp_exp'], inplace=True)

numerical_features = ['person_age', 'person_income', 'loan_amnt', 
                      'loan_int_rate', 'loan_percent_income', 
                      'cb_person_cred_hist_length', 'credit_score']

# NOTE(review): scaler and encoder are fit on the FULL dataset here, before
# the later train/test split — mild leakage; ideally fit on the training
# split only.
scaler = StandardScaler()
loan_data[numerical_features] = scaler.fit_transform(loan_data[numerical_features])

# One-hot encode categoricals, dropping the first level of each to avoid
# perfect collinearity.
categorical_columns = ['person_gender', 'person_education', 'person_home_ownership', 'loan_intent', 'previous_loan_defaults_on_file']
encoder = OneHotEncoder(sparse_output=False, drop='first')
encoded_features = encoder.fit_transform(loan_data[categorical_columns])

encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(categorical_columns))
loan_data = pd.concat([loan_data.drop(columns=categorical_columns), encoded_df], axis=1)
print(loan_data.head())
   person_age  person_income  loan_amnt  loan_int_rate  loan_percent_income  \
0   -1.014624      -0.104090   3.061053       1.683039             4.016394   
1   -1.193116      -0.846005  -1.364261       0.044782            -0.684829   
2   -0.479148      -0.844065  -0.656065       0.625557             3.443074   
3   -0.836132      -0.007039   3.061053       1.417829             3.443074   
4   -0.657640      -0.176371   3.061053       1.095549             4.475050   

   cb_person_cred_hist_length  credit_score  loan_status  person_gender_male  \
0                   -0.766708     -1.445292            1                 0.0   
1                   -1.038860     -2.594200            0                 0.0   
2                   -0.766708      0.046274            1                 0.0   
3                   -1.038860      0.852526            1                 0.0   
4                   -0.494555     -0.941384            1                 1.0   

   person_education_Bachelor  ...  person_education_Master  \
0                        0.0  ...                      1.0   
1                        0.0  ...                      0.0   
2                        0.0  ...                      0.0   
3                        1.0  ...                      0.0   
4                        0.0  ...                      1.0   

   person_home_ownership_OTHER  person_home_ownership_OWN  \
0                          0.0                        0.0   
1                          0.0                        1.0   
2                          0.0                        0.0   
3                          0.0                        0.0   
4                          0.0                        0.0   

   person_home_ownership_RENT  loan_intent_EDUCATION  \
0                         1.0                    0.0   
1                         0.0                    1.0   
2                         0.0                    0.0   
3                         1.0                    0.0   
4                         1.0                    0.0   

   loan_intent_HOMEIMPROVEMENT  loan_intent_MEDICAL  loan_intent_PERSONAL  \
0                          0.0                  0.0                   1.0   
1                          0.0                  0.0                   0.0   
2                          0.0                  1.0                   0.0   
3                          0.0                  1.0                   0.0   
4                          0.0                  1.0                   0.0   

   loan_intent_VENTURE  previous_loan_defaults_on_file_Yes  
0                  0.0                                 0.0  
1                  0.0                                 1.0  
2                  0.0                                 0.0  
3                  0.0                                 0.0  
4                  0.0                                 0.0  

[5 rows x 22 columns]
loan_data.isna().sum()
person_age                            0
person_income                         0
loan_amnt                             0
loan_int_rate                         0
loan_percent_income                   0
cb_person_cred_hist_length            0
credit_score                          0
loan_status                           0
person_gender_male                    0
person_education_Bachelor             0
person_education_Doctorate            0
person_education_High School          0
person_education_Master               0
person_home_ownership_OTHER           0
person_home_ownership_OWN             0
person_home_ownership_RENT            0
loan_intent_EDUCATION                 0
loan_intent_HOMEIMPROVEMENT           0
loan_intent_MEDICAL                   0
loan_intent_PERSONAL                  0
loan_intent_VENTURE                   0
previous_loan_defaults_on_file_Yes    0
dtype: int64
# Set environment variables for MLflow and DagsHub
# SECURITY(review): live credentials are hard-coded in source. This token is
# now exposed and should be rotated; load these values from the environment
# or a secrets manager instead of committing them.
os.environ['MLFLOW_TRACKING_URI'] = "https://dagshub.com/tejaswini161/my-first-repo.mlflow"
os.environ['MLFLOW_TRACKING_USERNAME'] = "tejaswini161"
os.environ['MLFLOW_TRACKING_PASSWORD'] = "86ddd17b46cb18df8455f352e782088c0bcf0652"
# Rebuild the modeling matrices from the preprocessed `loan_data` frame
# (the earlier split above used the un-preprocessed `loan_data_full`).
X = loan_data.drop(columns=['loan_status'])  
y = loan_data['loan_status'] 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
# After one-hot encoding, every remaining column is numeric.
numeric_features = X_train.columns.tolist()

def shift_positive(X):
    """Translate every column so its minimum maps to a strictly positive value.

    NOTE(review): the offset is recomputed from whatever batch is passed in,
    so train and test batches receive different shifts — confirm this is
    intended; a stateful transformer would apply one consistent offset.
    """
    column_minimums = X.min(axis=0)
    return X + (np.abs(column_minimums) + 1)

# Numeric preprocessing: shift every feature so it is strictly positive.
numeric_transformer = Pipeline(steps=[
    ('shift_positive', FunctionTransformer(shift_positive, validate=False)),
])

preprocessor = ColumnTransformer(
    transformers=[('num', numeric_transformer, numeric_features)],
)

# Baseline model: logistic regression behind the shared preprocessor.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=42)),
])

# Search regularization strength and solver, scored by F1 under
# stratified 10-fold cross-validation.
param_grid = {
    'classifier__C': [0.01, 0.1, 1, 10],
    'classifier__solver': ['lbfgs', 'liblinear'],
}

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring=make_scorer(f1_score),
    cv=skf,
    n_jobs=-1,
)

# Experiment #1: tuned logistic regression, tracked in MLflow.
mlflow.set_experiment("Loan Prediction Experiment #1")
with mlflow.start_run():
    grid_search.fit(X_train, y_train)
    best_params = grid_search.best_params_
    mlflow.log_params(best_params)
    # FIX: report the CV score of the *winning* parameter setting. The
    # original averaged `mean_test_score` across every grid candidate,
    # mixing good and bad configurations into one misleading number.
    best_index = grid_search.best_index_
    cv_f1_mean = grid_search.cv_results_['mean_test_score'][best_index]
    cv_f1_std = grid_search.cv_results_['std_test_score'][best_index]
    mlflow.log_metric("cv_f1_mean", cv_f1_mean)
    mlflow.log_metric("cv_f1_std", cv_f1_std)
    # GridSearchCV (refit=True by default) has already refit the best
    # estimator on the whole training split — the original's extra
    # best_model.fit() call was redundant.
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    test_f1 = f1_score(y_test, y_pred)
    mlflow.log_metric("test_f1", test_f1)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    mlflow.log_metric("true_positive", tp)
    mlflow.log_metric("true_negative", tn)
    mlflow.log_metric("false_positive", fp)
    mlflow.log_metric("false_negative", fn)
    mlflow.sklearn.log_model(best_model, "loan_prediction_model")

print(f"Best Parameters: {best_params}")
print(f"Cross-Validation F1 Score: Mean = {cv_f1_mean:.4f}, Std = {cv_f1_std:.4f}")
print(f"Test F1 Score: {test_f1:.4f}")
print(f"Confusion Matrix: TP = {tp}, TN = {tn}, FP = {fp}, FN = {fn}")
๐Ÿƒ View run salty-auk-593 at: https://dagshub.com/tejaswini161/my-first-repo.mlflow/#/experiments/0/runs/d62c231dfd0345a58d863191865cda6b
๐Ÿงช View experiment at: https://dagshub.com/tejaswini161/my-first-repo.mlflow/#/experiments/0
# Experiment #2: compare classifier families under identical preprocessing,
# logging each model as its own MLflow run.
classifiers = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=500),
    'Ridge Classifier': RidgeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42, n_jobs=-1),
    # FIX: dropped `use_label_encoder=False` — current XGBoost ignores it and
    # emits a "Parameters: { use_label_encoder } are not used" warning.
    'XGB Classifier': XGBClassifier(random_state=42, eval_metric='logloss')
}
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
f1_scorer = make_scorer(f1_score)  # loop-invariant: construct once
mlflow.set_experiment("Loan Prediction Experiment #2")

for name, clf in classifiers.items():
    with mlflow.start_run(run_name=name):
        pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('classifier', clf)
        ])
        # CV estimate computed on the training split only.
        cv_scores = cross_val_score(pipeline, X_train, y_train, cv=cv, scoring=f1_scorer, n_jobs=-1)
        mlflow.log_param("model", name)
        mlflow.log_metric("cv_f1_mean", np.mean(cv_scores))
        mlflow.log_metric("cv_f1_std", np.std(cv_scores))
        # Fit on the full training split, then score the held-out test set.
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)
        test_f1 = f1_score(y_test, y_pred)
        mlflow.log_metric("test_f1", test_f1)
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
        mlflow.log_metric("true_positive", tp)
        mlflow.log_metric("true_negative", tn)
        mlflow.log_metric("false_positive", fp)
        mlflow.log_metric("false_negative", fn)
        mlflow.sklearn.log_model(pipeline, f"{name}_model")
        
        print(f"{name} Results:")
        print(f"CV F1 Score: Mean = {np.mean(cv_scores):.4f}, Std = {np.std(cv_scores):.4f}")
        print(f"Test F1 Score: {test_f1:.4f}")
        print(f"Confusion Matrix: TP = {tp}, TN = {tn}, FP = {fp}, FN = {fn}")
2024/12/21 22:22:24 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.
Logistic Regression Results:
CV F1 Score: Mean = 0.7589, Std = 0.0083
Test F1 Score: 0.7690
Confusion Matrix: TP = 1500, TN = 6599, FP = 401, FN = 500
๐Ÿƒ View run Logistic Regression at: https://dagshub.com/tejaswini161/my-first-repo.mlflow/#/experiments/1/runs/055c25b73f814926a6ff32cf5038480b
๐Ÿงช View experiment at: https://dagshub.com/tejaswini161/my-first-repo.mlflow/#/experiments/1
2024/12/21 22:22:37 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.
Ridge Classifier Results:
CV F1 Score: Mean = 0.7415, Std = 0.0118
Test F1 Score: 0.7591
Confusion Matrix: TP = 1407, TN = 6700, FP = 300, FN = 593
๐Ÿƒ View run Ridge Classifier at: https://dagshub.com/tejaswini161/my-first-repo.mlflow/#/experiments/1/runs/77ded353408d4e44978184a978cb9c20
๐Ÿงช View experiment at: https://dagshub.com/tejaswini161/my-first-repo.mlflow/#/experiments/1
2024/12/21 22:23:04 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.
Random Forest Results:
CV F1 Score: Mean = 0.8236, Std = 0.0132
Test F1 Score: 0.8279
Confusion Matrix: TP = 1535, TN = 6827, FP = 173, FN = 465
๐Ÿƒ View run Random Forest at: https://dagshub.com/tejaswini161/my-first-repo.mlflow/#/experiments/1/runs/753db56a644f4ea18a34d202b3696a9f
๐Ÿงช View experiment at: https://dagshub.com/tejaswini161/my-first-repo.mlflow/#/experiments/1
C:\Users\Tejeswini\anaconda3\Lib\site-packages\xgboost\core.py:158: UserWarning: [22:23:30] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0c55ff5f71b100e98-1\xgboost\xgboost-ci-windows\src\learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
2024/12/21 22:23:42 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.
XGB Classifier Results:
CV F1 Score: Mean = 0.8350, Std = 0.0152
Test F1 Score: 0.8496
Confusion Matrix: TP = 1630, TN = 6793, FP = 207, FN = 370
๐Ÿƒ View run XGB Classifier at: https://dagshub.com/tejaswini161/my-first-repo.mlflow/#/experiments/1/runs/62ca3440cb95456aa024b58c41bc3c99
๐Ÿงช View experiment at: https://dagshub.com/tejaswini161/my-first-repo.mlflow/#/experiments/1
# Experiment #3: intended to evaluate engineered ratio features.
mlflow.set_experiment("Loan Prediction Experiment #3")
with mlflow.start_run():
    # NOTE(review): these engineered features are only *logged* as a param —
    # no such columns are ever added to X_train/X_test in this cell. Also,
    # `pipeline` here is whatever the previous loop left behind (its last
    # classifier), so this run re-measures that model rather than any
    # feature engineering. Build the features and a fresh pipeline before
    # trusting this experiment.
    mlflow.log_param("feature_engineering", [
        "income_to_loan_ratio = person_income / loan_amnt",
        "employment_to_age_ratio = person_emp_exp / (person_age + 1)",
        "interest_to_income_ratio = loan_int_rate / (person_income + 1)"
    ])
    f1_scorer = make_scorer(f1_score)
    cv_scores = cross_val_score(pipeline, X_train, y_train, cv=cv, scoring=f1_scorer, n_jobs=-1)
    mlflow.log_metric("cv_f1_mean", np.mean(cv_scores))
    mlflow.log_metric("cv_f1_std", np.std(cv_scores))
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    test_f1 = f1_score(y_test, y_pred)
    mlflow.log_metric("test_f1", test_f1)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    mlflow.log_metric("true_positive", tp)
    mlflow.log_metric("true_negative", tn)
    mlflow.log_metric("false_positive", fp)
    mlflow.log_metric("false_negative", fn)
    mlflow.sklearn.log_model(pipeline, "loan_prediction_model_with_feature_engineering")

    print("Feature Engineering:")
    print(f"CV F1 Score: Mean = {np.mean(cv_scores):.4f}, Std = {np.std(cv_scores):.4f}")
    print(f"Test F1 Score: {test_f1:.4f}")
    print(f"Confusion Matrix: TP = {tp}, TN = {tn}, FP = {fp}, FN = {fn}")
C:\Users\Tejeswini\anaconda3\Lib\site-packages\xgboost\core.py:158: UserWarning: [22:04:06] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0c55ff5f71b100e98-1\xgboost\xgboost-ci-windows\src\learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
2024/12/21 22:04:18 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.
Feature Engineering:
CV F1 Score: Mean = 0.8350, Std = 0.0152
Test F1 Score: 0.8496
Confusion Matrix: TP = 1630, TN = 6793, FP = 207, FN = 370
๐Ÿƒ View run smiling-ram-480 at: https://dagshub.com/tejaswini161/my-first-repo.mlflow/#/experiments/2/runs/df69a7bad5a14cee88246c2ef9f811cf
๐Ÿงช View experiment at: https://dagshub.com/tejaswini161/my-first-repo.mlflow/#/experiments/2
# Re-split with the same parameters as above; X and y are unchanged since
# the previous split, so random_state=42 reproduces the same partition.
X = loan_data.drop(columns=['loan_status'])  
y = loan_data['loan_status'] 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

class VarianceThresholdSelector(BaseEstimator, TransformerMixin):
    """Drop columns whose variance is below *threshold*, keeping DataFrame output."""

    def __init__(self, threshold=0.01):
        self.threshold = threshold
        self.selector = None

    def fit(self, X, y=None):
        # Delegate the variance computation to sklearn's VarianceThreshold.
        self.selector = VarianceThreshold(threshold=self.threshold)
        self.selector.fit(X)
        return self

    def transform(self, X):
        # Re-wrap as a DataFrame so downstream selectors can use column names.
        kept_mask = self.selector.get_support()
        reduced = self.selector.transform(X)
        return pd.DataFrame(reduced, columns=X.columns[kept_mask])

class CorrelationThresholdSelector(BaseEstimator, TransformerMixin):
    """Drop one column from every pair whose absolute correlation exceeds *threshold*."""

    def __init__(self, threshold=0.85):
        self.threshold = threshold
        self.to_drop = []

    def fit(self, X, y=None):
        abs_corr = X.corr().abs()
        # Keep only the strictly-upper triangle so each pair is examined once.
        pair_mask = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
        upper = abs_corr.where(pair_mask)
        self.to_drop = [col for col in upper.columns if (upper[col] > self.threshold).any()]
        return self

    def transform(self, X):
        return X.drop(columns=self.to_drop)

class FeatureImportanceSelector(BaseEstimator, TransformerMixin):
    """Keep the features whose importance is at least the mean importance.

    Importances come from fitting the supplied estimator (default: a
    RandomForestClassifier with random_state=42).
    """

    def __init__(self, model=None):
        # FIX: the original used `model=RandomForestClassifier(...)` as a
        # default argument. That instance is created once at class-definition
        # time and shared — and re-fitted — across every selector that relies
        # on the default. Defer creation to fit() instead.
        self.model = model
        self.selected_features = []

    def fit(self, X, y):
        # Use the caller's estimator if given; otherwise build a fresh default
        # per instance. A provided estimator is fitted in place, matching the
        # original behavior.
        self.model_ = self.model if self.model is not None else RandomForestClassifier(random_state=42)
        self.model_.fit(X, y)
        importances = self.model_.feature_importances_
        # Mean importance is the (data-dependent) cut-off.
        threshold = np.mean(importances)
        self.selected_features = X.columns[importances >= threshold]
        return self

    def transform(self, X):
        return X[self.selected_features]

# Chain the three selectors: low-variance filter -> correlated-pair filter ->
# model-importance filter.
feature_selection_pipeline = Pipeline(steps=[
    ('variance_threshold', VarianceThresholdSelector(threshold=0.01)),
    ('correlation_threshold', CorrelationThresholdSelector(threshold=0.85)),
    ('feature_importance', FeatureImportanceSelector(model=RandomForestClassifier(random_state=42)))
])

pipeline = Pipeline(steps=[
    ('feature_selection', feature_selection_pipeline),
    ('classifier', LogisticRegression(random_state=42))
])

# 3-fold CV on the training split; selection is re-run inside each fold.
f1_scorer = make_scorer(f1_score)
cv_scores = cross_val_score(pipeline, X_train, y_train, cv=3, scoring=f1_scorer, n_jobs=-1)

# Experiment #4: logistic regression on the selected feature subset.
mlflow.set_experiment("Loan Prediction Experiment #4")
with mlflow.start_run():
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    test_f1 = f1_score(y_test, y_pred)
    mlflow.log_metric("cv_f1_mean", np.mean(cv_scores))
    mlflow.log_metric("cv_f1_std", np.std(cv_scores))
    mlflow.log_metric("test_f1", test_f1)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    mlflow.log_metric("true_positive", tp)
    mlflow.log_metric("true_negative", tn)
    mlflow.log_metric("false_positive", fp)
    mlflow.log_metric("false_negative", fn)
    mlflow.sklearn.log_model(pipeline, "loan_prediction_model_with_feature_selection")
print("Feature Selection:")
print(f"CV F1 Score: Mean = {np.mean(cv_scores):.4f}, Std = {np.std(cv_scores):.4f}")
print(f"Test F1 Score: {test_f1:.4f}")
print(f"Confusion Matrix: TP = {tp}, TN = {tn}, FP = {fp}, FN = {fn}")
2024/12/21 22:12:47 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.
๐Ÿƒ View run delightful-bird-741 at: https://dagshub.com/tejaswini161/my-first-repo.mlflow/#/experiments/3/runs/8e2c976409f84dcb96505223eb01eb80
๐Ÿงช View experiment at: https://dagshub.com/tejaswini161/my-first-repo.mlflow/#/experiments/3
Feature Selection:
CV F1 Score: Mean = 0.7479, Std = 0.0051
Test F1 Score: 0.7624
Confusion Matrix: TP = 1495, TN = 6573, FP = 427, FN = 505
# Standardize the training features, then run a full PCA to inspect how
# variance accumulates across components.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_train)
pca = PCA()
X_pca = pca.fit_transform(X_scaled)

explained_variance_ratio = pca.explained_variance_ratio_
cumulative_variance_ratio = np.cumsum(explained_variance_ratio)

# Scree plot of cumulative explained variance, saved for MLflow logging.
component_index = np.arange(1, explained_variance_ratio.size + 1)
plt.figure(figsize=(10, 6))
plt.plot(component_index, cumulative_variance_ratio, marker='o', linestyle='--')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('Scree Plot')
plt.grid()
plt.savefig("scree_plot.png", dpi=300)
plt.show()

# Smallest component count whose cumulative variance reaches 95%.
n_components = np.argmax(cumulative_variance_ratio >= 0.95) + 1
print(f"Number of components to retain 95% variance: {n_components}")

# Experiment #5: PCA-reduced logistic regression (scale -> project -> classify).
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),  
    ('pca', PCA(n_components=n_components)), 
    ('classifier', LogisticRegression(random_state=42))
])

# 3-fold CV F1 on the training split.
f1_scorer = make_scorer(f1_score)
cv_scores = cross_val_score(pipeline, X_train, y_train, cv=3, scoring=f1_scorer, n_jobs=-1)

mlflow.set_experiment("Loan Prediction Experiment #5")
with mlflow.start_run():
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    test_f1 = f1_score(y_test, y_pred)
    mlflow.log_param("n_components", n_components)
    # Attach the scree plot produced above as a run artifact.
    mlflow.log_artifact("scree_plot.png", artifact_path="plots")
    mlflow.log_metric("cv_f1_mean", np.mean(cv_scores))
    mlflow.log_metric("cv_f1_std", np.std(cv_scores))
    mlflow.log_metric("test_f1", test_f1)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    mlflow.log_metric("true_positive", tp)
    mlflow.log_metric("true_negative", tn)
    mlflow.log_metric("false_positive", fp)
    mlflow.log_metric("false_negative", fn)
    mlflow.sklearn.log_model(pipeline, "loan_prediction_model_with_pca")

print(f"CV F1 Score: Mean = {np.mean(cv_scores):.4f}, Std = {np.std(cv_scores):.4f}")
print(f"Test F1 Score: {test_f1:.4f}")
print(f"Confusion Matrix: TP = {tp}, TN = {tn}, FP = {fp}, FN = {fn}")
_images/30386e399cef532c13663dffd430fbde42c7eacf2a75efdc09f22dbcd336c7bc.png
Number of components to retain 95% variance: 17
2024/12/21 22:13:22 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.
๐Ÿƒ View run unruly-horse-204 at: https://dagshub.com/tejaswini161/my-first-repo.mlflow/#/experiments/4/runs/6f125609f5474c57b4eb896d4b13b9f9
๐Ÿงช View experiment at: https://dagshub.com/tejaswini161/my-first-repo.mlflow/#/experiments/4
CV F1 Score: Mean = 0.7595, Std = 0.0105
Test F1 Score: 0.7681
Confusion Matrix: TP = 1499, TN = 6596, FP = 404, FN = 501
n_components = 17 

# Experiment #6: scaled random forest baseline.
# NOTE(review): `n_components` is logged below, but this pipeline contains no
# PCA step — the logged param is misleading for this run.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(random_state=42))
])
f1_scorer = make_scorer(f1_score)
cv_scores = cross_val_score(pipeline, X_train, y_train, cv=3, scoring=f1_scorer, n_jobs=-1)

mlflow.set_experiment("Loan Prediction Experiment #6")
with mlflow.start_run():
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    test_f1 = f1_score(y_test, y_pred)
    mlflow.log_param("n_components", n_components)
    mlflow.log_param("classifier", "RandomForestClassifier")
    mlflow.log_metric("cv_f1_mean", np.mean(cv_scores))
    mlflow.log_metric("cv_f1_std", np.std(cv_scores))
    mlflow.log_metric("test_f1", test_f1)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    mlflow.log_metric("true_positive", tp)
    mlflow.log_metric("true_negative", tn)
    mlflow.log_metric("false_positive", fp)
    mlflow.log_metric("false_negative", fn)
    mlflow.sklearn.log_model(pipeline, "loan_prediction_model_custom_experiment")
    
print(f"Custom Experiment Results:")
print(f"CV F1 Score: Mean = {np.mean(cv_scores):.4f}, Std = {np.std(cv_scores):.4f}")
print(f"Test F1 Score: {test_f1:.4f}")
print(f"Confusion Matrix: TP = {tp}, TN = {tn}, FP = {fp}, FN = {fn}")
2024/12/21 22:13:46 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.
๐Ÿƒ View run magnificent-mink-419 at: https://dagshub.com/tejaswini161/my-first-repo.mlflow/#/experiments/5/runs/bde25646c84740eca437f2bd62c25d6d
๐Ÿงช View experiment at: https://dagshub.com/tejaswini161/my-first-repo.mlflow/#/experiments/5
Custom Experiment Results:
CV F1 Score: Mean = 0.8255, Std = 0.0029
Test F1 Score: 0.8273
Confusion Matrix: TP = 1533, TN = 6827, FP = 173, FN = 467
# Confirm the class imbalance (~78/22) that motivates the SMOTE experiment.
print(y.value_counts(normalize=True))
loan_status
0    0.777778
1    0.222222
Name: proportion, dtype: float64
# Experiment #7: address class imbalance with SMOTE + gradient boosting.
# FIX: the original resampled the whole training split *before*
# cross-validation, so synthetic minority samples leaked into the validation
# folds and inflated the CV F1 (0.9124 CV vs 0.7924 test). Placing SMOTE
# inside an imblearn Pipeline resamples only the training portion of each
# fold; samplers are automatically skipped at predict time.
pipeline = ImbPipeline(steps=[
    ('smote', SMOTE(random_state=42)),
    ('scaler', StandardScaler()),
    ('classifier', GradientBoostingClassifier(random_state=42))
])

f1_scorer = make_scorer(f1_score)
# CV now runs on the raw (imbalanced) training split — leak-free estimate.
cv_scores = cross_val_score(pipeline, X_train, y_train, cv=3, scoring=f1_scorer, n_jobs=-1)

mlflow.set_experiment("Loan Prediction Experiment #7")
with mlflow.start_run():
    # Fitting the pipeline applies SMOTE to the full training split and then
    # fits the scaler/classifier — the same final model as the original
    # manual fit_resample + fit.
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    test_f1 = f1_score(y_test, y_pred)
    mlflow.log_param("oversampling_technique", "SMOTE")
    mlflow.log_param("classifier", "GradientBoostingClassifier")
    mlflow.log_metric("cv_f1_mean", np.mean(cv_scores))
    mlflow.log_metric("cv_f1_std", np.std(cv_scores))
    mlflow.log_metric("test_f1", test_f1)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    mlflow.log_metric("true_positive", tp)
    mlflow.log_metric("true_negative", tn)
    mlflow.log_metric("false_positive", fp)
    mlflow.log_metric("false_negative", fn)
    mlflow.sklearn.log_model(pipeline, "loan_prediction_model_with_smote")

print(f"Custom Experiment Results:")
print(f"CV F1 Score: Mean = {np.mean(cv_scores):.4f}, Std = {np.std(cv_scores):.4f}")
print(f"Test F1 Score: {test_f1:.4f}")
print(f"Confusion Matrix: TP = {tp}, TN = {tn}, FP = {fp}, FN = {fn}")
2024/12/21 22:14:43 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.
๐Ÿƒ View run defiant-colt-205 at: https://dagshub.com/tejaswini161/my-first-repo.mlflow/#/experiments/6/runs/f14a8996c80e48debc9e046065877aac
๐Ÿงช View experiment at: https://dagshub.com/tejaswini161/my-first-repo.mlflow/#/experiments/6
Custom Experiment Results:
CV F1 Score: Mean = 0.9124, Std = 0.0209
Test F1 Score: 0.7924
Confusion Matrix: TP = 1763, TN = 6313, FP = 687, FN = 237
# Hand-entered summary of the metrics from Experiments #1-#7.
# NOTE(review): some values differ slightly from the printed run outputs
# above (e.g. Experiment #2.1 is listed as 0.7596 here but printed 0.7589)
# — verify against the MLflow runs before publishing.
experiments = [
    'Experiment #1: Logistic Regression',
    'Experiment #2.1: Logistic Regression',
    'Experiment #2.2: Ridge Classifier',
    'Experiment #2.3: Random Forest',
    'Experiment #2.4: XGB Classifier',
    'Experiment #3: Feature Engineering',
    'Experiment #4: Feature Selection',
    'Experiment #5: PCA + Logistic Regression',
    'Experiment #6: Random Forest',
    'Experiment #7: SMOTE + Gradient Boosting'
]

# Mean cross-validation F1 per experiment.
cv_f1_means = [
    0.7570,  # Experiment #1
    0.7596,  # Experiment #2.1: Logistic Regression
    0.7415,  # Experiment #2.2: Ridge Classifier
    0.8236,  # Experiment #2.3: Random Forest
    0.8350,  # Experiment #2.4: XGB Classifier
    0.7881,  # Experiment #3
    0.7479,  # Experiment #4
    0.7595,  # Experiment #5
    0.8255,  # Experiment #6
    0.9124   # Experiment #7
]

# Standard deviation of the CV F1 per experiment.
cv_f1_stds = [
    0.0037,  # Experiment #1
    0.0085,  # Experiment #2.1: Logistic Regression
    0.0118,  # Experiment #2.2: Ridge Classifier
    0.0132,  # Experiment #2.3: Random Forest
    0.0152,  # Experiment #2.4: XGB Classifier
    0.0113,  # Experiment #3
    0.0051,  # Experiment #4
    0.0105,  # Experiment #5
    0.0029,  # Experiment #6
    0.0209   # Experiment #7
]

# Test F1 Scores
test_f1_scores = [
    0.7688,  # Experiment #1
    0.7693,  # Experiment #2.1: Logistic Regression
    0.7591,  # Experiment #2.2: Ridge Classifier
    0.8279,  # Experiment #2.3: Random Forest
    0.8496,  # Experiment #2.4: XGB Classifier
    0.8000,  # Experiment #3
    0.7624,  # Experiment #4
    0.7681,  # Experiment #5
    0.8273,  # Experiment #6
    0.7924   # Experiment #7
]

# Horizontal bar chart: mean CV F1 per experiment, std shown as error bars.
fig, ax = plt.subplots(figsize=(12, 6))
ax.barh(experiments, cv_f1_means, xerr=cv_f1_stds, color='skyblue', edgecolor='black')
ax.set_xlabel('F1-Score (Mean ± Std)')  # fixed mojibake: 'ยฑ' was a mis-encoded '±'
ax.set_title('Cross-Validation F1-Scores Across Experiments')
ax.grid(axis='x', linestyle='--', alpha=0.6)
fig.tight_layout()
fig.savefig("cv_f1_scores.png", dpi=300)
plt.show()


# Horizontal bar chart: held-out test F1 per experiment.
fig, ax = plt.subplots(figsize=(12, 6))
ax.barh(experiments, test_f1_scores, color='lightgreen', edgecolor='black')
ax.set_xlabel('F1-Score')
ax.set_title('Test F1-Scores Across Experiments')
ax.grid(axis='x', linestyle='--', alpha=0.6)
fig.tight_layout()
fig.savefig("test_f1_scores.png", dpi=300)
plt.show()
_images/93fa978210e0b12034e4e8b0cb39b578eb5071ad765f7fc6ce143c282231716b.png _images/06a0b7c6729b1660a24aec625df02534b3ea8ad54cae797ea3aa44db96c01a14.png
# Final model-selection rationale, emitted as three lines (identical output to before).
rationale = [
    'Best Model is XGB Classifier as it achieves the highest test F1-score (0.8496), indicating the best real-world generalization on unseen data.',
    'With a mean F1-score of 0.8350 and standard deviation of 0.0152, it is consistent during cross-validation.',
    'Unlike Experiment #7 (SMOTE + Gradient Boosting), which shows potential overfitting that is high cross-validation but lower test F1, XGB Classifier provides a reliable balance between training and test performance.',
]
print('\n'.join(rationale))
Best Model is XGB Classifier as it achieves the highest test F1-score (0.8496), indicating the best real-world generalization on unseen data.
With a mean F1-score of 0.8350 and standard deviation of 0.0152, it is consistent during cross-validation.
Unlike Experiment #7 (SMOTE + Gradient Boosting), which shows potential overfitting that is high cross-validation but lower test F1, XGB Classifier provides a reliable balance between training and test performance.
# Fit the feature-selection pipeline and report which features survive each stage.
feature_selection_pipeline.fit(X_train, y_train)
pipeline_steps = feature_selection_pipeline.named_steps

# Stage 1: boolean mask of columns whose variance cleared the threshold.
variance_selected_features = pipeline_steps['variance_threshold'].selector.get_support()
variance_features = X_train.columns[variance_selected_features]
print("Features after Variance Threshold:")
print(variance_features.tolist())

# Stage 2: remove the columns flagged as too highly correlated.
correlation_dropped_features = pipeline_steps['correlation_threshold'].to_drop
correlation_features = variance_features.difference(correlation_dropped_features)
print("\nFeatures after Correlation Threshold:")
print(correlation_features.tolist())

# Stage 3: keep only the top features by importance; these feed the final model.
importance_selected_features = pipeline_steps['feature_importance'].selected_features
print("\nTop Features after Feature Importance:")
print(importance_selected_features.tolist())

top_features = importance_selected_features
Features after Variance Threshold:
['person_age', 'person_income', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length', 'credit_score', 'person_gender_male', 'person_education_Bachelor', 'person_education_Doctorate', 'person_education_High School', 'person_education_Master', 'person_home_ownership_OWN', 'person_home_ownership_RENT', 'loan_intent_EDUCATION', 'loan_intent_HOMEIMPROVEMENT', 'loan_intent_MEDICAL', 'loan_intent_PERSONAL', 'loan_intent_VENTURE', 'previous_loan_defaults_on_file_Yes']

Features after Correlation Threshold:
['credit_score', 'loan_amnt', 'loan_int_rate', 'loan_intent_EDUCATION', 'loan_intent_HOMEIMPROVEMENT', 'loan_intent_MEDICAL', 'loan_intent_PERSONAL', 'loan_intent_VENTURE', 'loan_percent_income', 'person_age', 'person_education_Bachelor', 'person_education_Doctorate', 'person_education_High School', 'person_education_Master', 'person_gender_male', 'person_home_ownership_OWN', 'person_home_ownership_RENT', 'person_income', 'previous_loan_defaults_on_file_Yes']

Top Features after Feature Importance:
['person_income', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'credit_score', 'person_home_ownership_RENT', 'previous_loan_defaults_on_file_Yes']
import joblib

# Retrain the chosen XGBoost model on the full dataset, restricted to the
# selected features, and serialize it to disk for later serving.
final_model = XGBClassifier(random_state=42)
final_model.fit(X[top_features], y)

joblib.dump(final_model, "loan_prediction_model.pkl")
print("Model saved as loan_prediction_model.pkl")
Model saved as loan_prediction_model.pkl